In [1]:
import sys
sys.path.append('..')
from twords.twords import Twords
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
# Show the full text of every dataframe cell so that entire tweets are visible.
# None means "no truncation"; the old -1 sentinel was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
In [2]:
# Create the Twords instance that all later cells in this notebook operate on.
twit = Twords()
# set path to folder that contains jar files for twitter search
# (relative path, so it depends on the directory the notebook is run from)
twit.jar_folder_path = "../jar_files_and_background/"
In [ ]:
# This cell has no execution count, i.e. it was not run in the saved notebook.
# NOTE(review): presumably downloads the tweets of the given user in batches of
# `tweets_per_run` via the jar-based search — confirm against Twords before re-running.
twit.get_all_user_tweets("realdonaldtrump", tweets_per_run=500)
In [3]:
# Tell Twords where the tweet data lives, then load it.
# NOTE(review): "realdonaldtrump" is presumably a folder of CSV files produced by
# get_all_user_tweets — confirm.
twit.data_path = "realdonaldtrump"
twit.get_java_tweets_from_csv_list()
# NOTE(review): this method is called a second time in the cleaning cell further down;
# confirm whether both calls are needed.
twit.convert_tweet_dates_to_standard()
To sort tweets by favorites or retweets, we first need to convert those counts from unicode strings to integers:
In [4]:
# Convert both count columns to integers, one column at a time.
for count_column in ("retweets", "favorites"):
    twit.tweets_df[count_column] = twit.tweets_df[count_column].map(int)
In [5]:
# Five tweets with the highest "favorites" count, largest first.
twit.tweets_df.sort_values(by="favorites", ascending=False).head(5)
Out[5]:
In [6]:
# Five tweets with the highest "retweets" count, largest first.
twit.tweets_df.sort_values(by="retweets", ascending=False).head(5)
Out[6]:
For some reason the search results did not include Trump's username. Random errors like this sometimes happen when querying the Twitter website.
In [7]:
# Path to the background word-frequency table (CSV).
# NOTE(review): the file name suggests it was built from a Twitter corpus of
# 72,319,443 total words — confirm provenance.
twit.background_path = '../jar_files_and_background/freq_table_72319443_total_words_twitter_corpus.csv'
# NOTE(review): presumably these read `background_path` set above and prepare the
# background counts and stop-word list used by the frequency table — confirm.
twit.create_Background_dict()
twit.create_Stop_words()
In [8]:
# Tweet-cleaning pipeline; the calls are kept in exactly this order.
# NOTE(review): what each step does is inferred from its method name only —
# confirm against the Twords implementation.
# Presumably preserves the unmodified text before the steps below alter it.
twit.keep_column_of_original_tweets()
twit.lower_tweets()
twit.keep_only_unicode_tweet_text()
twit.remove_urls_from_tweets()
twit.remove_punctuation_from_tweets()
twit.drop_non_ascii_characters_from_tweets()
twit.drop_duplicate_tweets()
# Already called once in the loading cell above; repeated here before sorting by date.
twit.convert_tweet_dates_to_standard()
twit.sort_tweets_by_date()
Make word frequency dataframe:
In [9]:
# Build the word-frequency table; `twit.word_freq_df` is read by the cells below.
twit.create_word_bag()
twit.make_nltk_object_from_word_bag()
# NOTE(review): 10000 is presumably the number of most frequent words to keep in
# the table — confirm against Twords.create_word_freq_df.
twit.create_word_freq_df(10000)
In [10]:
# Reorder the frequency table in place, largest log relative frequency first,
# then preview the first 20 rows.
twit.word_freq_df.sort_values(
    by="log relative frequency",
    ascending=False,
    inplace=True,
)
twit.word_freq_df.head(20)
Out[10]:
In [11]:
num_words_to_plot = 32
background_cutoff = 100

# Keep only words with more than `background_cutoff` background occurrences.
frequent_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]

# Ascending sort + tail slice = the `num_words_to_plot` highest log relative
# frequencies, with the largest value drawn as the top bar.
words_to_plot = (
    frequent_words
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"][-num_words_to_plot:]
)

# Draw on the Axes returned by pandas. Calling plt.axes() here would add a new,
# empty Axes on current matplotlib, so the grid would not land on the bar chart.
ax = words_to_plot.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [12]:
num_words_to_plot = 32
background_cutoff = 1000

# Keep only words with more than `background_cutoff` background occurrences.
frequent_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]

# Ascending sort + tail slice = the `num_words_to_plot` highest log relative
# frequencies, with the largest value drawn as the top bar.
words_to_plot = (
    frequent_words
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"][-num_words_to_plot:]
)

# Draw on the Axes returned by pandas. Calling plt.axes() here would add a new,
# empty Axes on current matplotlib, so the grid would not land on the bar chart.
ax = words_to_plot.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [13]:
num_words_to_plot = 32
background_cutoff = 10000

# Keep only words with more than `background_cutoff` background occurrences.
frequent_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]

# Ascending sort + tail slice = the `num_words_to_plot` highest log relative
# frequencies, with the largest value drawn as the top bar.
words_to_plot = (
    frequent_words
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"][-num_words_to_plot:]
)

# Draw on the Axes returned by pandas. Calling plt.axes() here would add a new,
# empty Axes on current matplotlib, so the grid would not land on the bar chart.
ax = words_to_plot.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [14]:
num_words_to_plot = 32
background_cutoff = 10000

# Keep only words with more than `background_cutoff` background occurrences.
frequent_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]

# Descending sort + tail slice = the `num_words_to_plot` LOWEST log relative
# frequencies (the opposite end of the table from the ascending-sort plots).
words_to_plot = (
    frequent_words
    .sort_values("log relative frequency", ascending=False)
    .set_index("word")["log relative frequency"][-num_words_to_plot:]
)

# Draw on the Axes returned by pandas. Calling plt.axes() here would add a new,
# empty Axes on current matplotlib, so the grid would not land on the bar chart.
ax = words_to_plot.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
Trump does not post about things happening automatically.
In [15]:
# The call's result is displayed as this cell's output.
# NOTE(review): presumably returns the tweets whose text contains the given term — confirm.
twit.tweets_containing("fuck")
Out[15]:
In [ ]: